半次元图片爬虫

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import requests
import json
import re
import os
import threading
from lxml import etree

def Requests(url):
    """Fetch *url* with a browser-like User-Agent, retrying on failure.

    Each attempt uses a 10 second timeout. Both network errors and
    non-200 responses count as a failed attempt, so the function always
    terminates after at most 10 attempts.

    Args:
        url: The address to request with HTTP GET.

    Returns:
        The ``requests.Response`` of the first attempt that answers with
        status code 200, or ``False`` when every attempt failed.
    """
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'}
    max_attempts = 10
    for _ in range(max_attempts):
        try:
            response = requests.get(url, headers=head, timeout=10)
        except requests.RequestException:
            # A failed request must consume an attempt too; otherwise a
            # URL that always raises would be retried forever.
            continue
        if response.status_code == 200:
            return response
    return False


def folder_mkdir(title):
    """Ensure a sub-folder named *title* exists in the current working directory.

    Args:
        title: Name of the folder to create below ``os.getcwd()``.

    Returns:
        The folder path as a string ending with the platform's path
        separator, so callers can build a file path with ``folder + name``.
    """
    folder = os.path.join(os.getcwd(), title)
    # exist_ok avoids the check-then-create race when the folder already
    # exists or is created concurrently.
    os.makedirs(folder, exist_ok=True)
    # Joining with an empty component appends the trailing separator.
    return os.path.join(folder, '')


def install_img(url, folder, name):
    """Download *url* and write the response body to ``folder + name``.

    A failed download or a failed write is reported on stdout and the
    function returns without raising.

    Args:
        url: Address of the image to download.
        folder: Destination directory path; it is concatenated directly
            with *name*, so it must end with a path separator.
        name: File name (including extension) to write inside *folder*.
    """
    response = Requests(url)
    if not response:
        # Requests() returns False once its retries are exhausted.
        print('错误:{}     名称:{}'.format(url, name))
        return
    try:
        with open(folder + name, 'wb') as img_file:
            img_file.write(response.content)
    except OSError:
        print('错误:{}     名称:{}'.format(url, name))


def get_data(item_host_url):
    """Download every image referenced by one post page.

    The page is fetched with ``Requests``, a folder named after the page's
    ``<title>`` is created with ``folder_mkdir``, and the image list is read
    from the JSON embedded in the page's ``window.__ssr_data`` script
    (``detail -> post_data -> multi``, one ``original_path`` per image).
    Images are saved as ``<n>.jpg`` / ``<n>.png`` where ``n`` continues
    after the number of entries already present in the folder.

    The function returns only after all download threads it started have
    finished. Pages that cannot be fetched or parsed are reported on
    stdout and skipped.

    Args:
        item_host_url: URL of the post page, e.g. a ``bcy.net/item/detail``
            link.
    """
    import time  # local import: only needed for the throttle sleep below

    mode = 'thread'  # multi-threaded download mode
    max_threads = 25  # upper bound on live threads, main thread included

    item_response = Requests(item_host_url)
    if not item_response:
        # Requests() returns False once its retries are exhausted.
        return
    # Set the encoding before the first access to .text so the title and
    # the embedded data are decoded the same way.
    item_response.encoding = 'utf-8'
    page_text = item_response.text

    tree = etree.HTML(page_text)
    titles = tree.xpath('//title/text()') if tree is not None else []
    title = titles[0].strip() if titles else ''
    if not title:
        print('错误:{}'.format(item_host_url))
        return

    folder = folder_mkdir(title)

    matches = re.findall(
        r'window.__ssr_data = JSON.parse\("(.*?)"\);\n      window._UID_ = \'0\';',
        page_text)
    if not matches:
        print('错误:{}'.format(item_host_url))
        return
    # Undo the JavaScript string escaping: \" becomes ", and removing
    # "u002F" leaves a doubled backslash which is then turned into "/".
    # NOTE(review): presumably slashes arrive as \\u002F in the page - confirm.
    item_data = matches[0].replace('\\"', '"').replace('u002F', '').replace('\\\\', '/')

    try:
        item_img_data = json.loads(item_data, strict=False)['detail']['post_data']['multi']
    except (ValueError, KeyError, TypeError):
        print('错误:{}'.format(item_host_url))
        return

    # Continue numbering after existing entries to avoid overwriting them.
    num = len(os.listdir(folder))
    workers = []
    for img_data in item_img_data:
        img_url = img_data['original_path']
        print(img_url)
        # "jpg" takes precedence; anything that is not recognisably PNG
        # is saved with a .jpg extension.
        if 'jpg' not in img_url and 'png' in img_url:
            img_fomat = '.png'
        else:
            img_fomat = '.jpg'

        num += 1
        name = str(num) + img_fomat
        if mode == 'thread':
            worker = threading.Thread(target=install_img, args=(img_url, folder, name))
            worker.start()
            workers.append(worker)
            # Throttle: sleep instead of spinning while too many threads
            # are alive.
            while len(threading.enumerate()) > max_threads:
                time.sleep(0.05)
        else:
            install_img(img_url, folder, name)

    # Wait for outstanding downloads so the caller's completion message is
    # accurate and the next call counts the files actually written.
    for worker in workers:
        worker.join()


if __name__ == '__main__':
    # Interactive loop: read one post URL per line and download its images.
    # Example URL:
    #   https://bcy.net/item/detail/6625993367937351944?_source_page=cos
    while True:
        try:
            item_host_url = input("请输入url地址:").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or Ctrl-C ends the program instead of a traceback.
            break
        if not item_host_url:
            # A blank line is not a URL; ask again rather than requesting it.
            continue
        get_data(item_host_url)
        print('下载完成')

Requests()

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
import requests
def Requests(url):
    """Fetch *url* with a browser-like User-Agent, retrying on failure.

    Each attempt uses a 10 second timeout. Both network errors and
    non-200 responses count as a failed attempt, so the function always
    terminates after at most 10 attempts.

    Args:
        url: The address to request with HTTP GET.

    Returns:
        The ``requests.Response`` of the first attempt that answers with
        status code 200, or ``False`` when every attempt failed.
    """
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'}
    max_attempts = 10  # give up on this link after too many failures
    for _ in range(max_attempts):
        try:
            response = requests.get(url, headers=head, timeout=10)
        except requests.RequestException:
            # A failed request must consume an attempt too; otherwise a
            # URL that always raises would be retried forever.
            continue
        if response.status_code == 200:
            return response
    return False

构建一个通用的请求函数:请求失败时自动重试,超过最大尝试次数(10 次)后返回 False,由调用方跳过该链接。

folder_mkdir()

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
import os
def folder_mkdir():
    """Ensure a ``pic`` folder exists in the current working directory.

    Returns:
        The folder path as a string ending with the platform's path
        separator, so callers can build a file path with ``folder + name``.
    """
    folder = os.path.join(os.getcwd(), 'pic')
    # exist_ok avoids the check-then-create race when the folder already
    # exists or is created concurrently.
    os.makedirs(folder, exist_ok=True)
    # Joining with an empty component appends the trailing separator.
    return os.path.join(folder, '')

在当前工作目录下创建 pic 文件夹(已存在则直接使用)用于储存图片,并返回该文件夹的路径。

get_data()

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
def get_data(item_host_url):
    """Download every image referenced by one post page.

    The page is fetched with ``Requests`` and the image list is read from
    the JSON embedded in the page's ``window.__ssr_data`` script
    (``detail -> post_data -> multi``, one ``original_path`` per image).
    Images are saved into the folder returned by ``folder_mkdir`` as
    ``<n>.jpg`` / ``<n>.png`` where ``n`` continues after the number of
    entries already present in that folder.

    The function returns only after all download threads it started have
    finished. Pages that cannot be fetched or parsed are skipped.

    Args:
        item_host_url: URL of the post page to download images from.
    """
    import time  # local import: only needed for the throttle sleep below

    mode = 'thread'  # multi-threaded download mode
    max_threads = 25  # upper bound on live threads, main thread included

    folder = folder_mkdir()
    item_response = Requests(item_host_url)
    if not item_response:
        # Requests() returns False once its retries are exhausted.
        return
    item_response.encoding = 'utf-8'

    matches = re.findall(
        r'window.__ssr_data = JSON.parse\("(.*?)"\);\n      window._UID_ = \'0\';',
        item_response.text)
    if not matches:
        print('错误:{}'.format(item_host_url))
        return
    # Undo the JavaScript string escaping: \" becomes ", and removing
    # "u002F" leaves a doubled backslash which is then turned into "/".
    # NOTE(review): presumably slashes arrive as \\u002F in the page - confirm.
    item_data = matches[0].replace('\\"', '"').replace('u002F', '').replace('\\\\', '/')

    try:
        item_img_data = json.loads(item_data, strict=False)['detail']['post_data']['multi']
    except (ValueError, KeyError, TypeError):
        print('错误:{}'.format(item_host_url))
        return

    # Continue numbering after existing entries to avoid overwriting them.
    num = len(os.listdir(folder))
    print(num)
    workers = []
    for img_data in item_img_data:
        img_url = img_data['original_path']
        # "jpg" takes precedence; anything that is not recognisably PNG
        # is saved with a .jpg extension.
        if 'jpg' not in img_url and 'png' in img_url:
            img_fomat = '.png'
        else:
            img_fomat = '.jpg'

        num += 1
        name = str(num) + img_fomat
        if mode == 'thread':
            worker = threading.Thread(target=install_img, args=(img_url, folder, name))
            worker.start()
            workers.append(worker)
            # Throttle: sleep instead of spinning while too many threads
            # are alive.
            while len(threading.enumerate()) > max_threads:
                time.sleep(0.05)
        else:
            install_img(img_url, folder, name)

    # Wait for outstanding downloads so the next call counts the files
    # actually written.
    for worker in workers:
        worker.join()

请求图集链接,获取网页源码,从